The purpose of this file is to practice applying basic R commands learned through lecture videos and perform exploratory analysis on worldwide COVID-19 data.

library(dplyr)
library(lubridate)
library(knitr)
library(ggplot2)
library(cowplot)

Our World in Data GitHub COVID-19 outcomes

Data was read in and summarized as shown below: (current file is as of 5/31)

owid <- read.csv(url("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"), na.strings = c("", "NA"), colClasses = c("date" = "Date"), header = TRUE)
dim(owid)
## [1] 94441    59
head(owid)
str(owid)
summary(owid)

Full Cleaned Data Set

Data set, including possible confounders for use in later analysis, is cleaned and filtered below.

owid.filtered <- owid %>%
                  rename(iso.code = iso_code, totcases = total_cases, totdeaths = total_deaths, totcases.mil = total_cases_per_million, totdeaths.mil = total_deaths_per_million, pop.density = population_density, med.age = median_age, older70 = aged_70_older, gdp.cap = gdp_per_capita, card.death = cardiovasc_death_rate, diab.prev = diabetes_prevalence, fem.smokers = female_smokers, male.smokers = male_smokers) %>%
                  select(c(iso.code:totcases, totdeaths, totcases.mil, totdeaths.mil, population, pop.density, med.age, older70, gdp.cap, card.death, diab.prev, fem.smokers, male.smokers)) %>% #Renamed variables and filtered to keep only those of interest. 
                  filter(date == "2020-05-31") #Filtered out all but most recent date (most recent date is a running sum of total cases).
table(owid.filtered$continent, useNA = "ifany") #Data includes both continent and country-specific outcomes. Continent outcomes can be distinguished by "NA" in the continent column; the continent name is in the location column. 
## 
##        Africa          Asia        Europe North America       Oceania 
##            54            47            46            23             4 
## South America          <NA> 
##            12             9

More exploratory analysis of data frames:

dim(owid.filtered)
head(owid.filtered)
tail(owid.filtered)

Continent Analysis

Data is filtered to remove entries for specific country locations and include only data that aggregates COVID-19 outcomes on a continent and world scale.

owid.continents <- owid.filtered %>%
                    filter(is.na(continent)) %>%
                    select(-continent) #Used missingness properties to select only continent rows and then removed continent column. 
table(owid.continents$location)
## 
##         Africa           Asia         Europe European Union  International 
##              1              1              1              1              1 
##  North America        Oceania  South America          World 
##              1              1              1              1

Country Analysis

Data is filtered to include only country-specific COVID-19 outcomes and exclude continent and world-wide data.

owid.countries <- owid.filtered %>%
                    filter(!is.na(continent)) #Removed all rows with NA for continent, leaving behind only countries. 
table(owid.countries$continent)
## 
##        Africa          Asia        Europe North America       Oceania 
##            54            47            46            23             4 
## South America 
##            12

Tests for Missingness

table(owid.filtered$totcases, useNA = "ifany") %>%
  tail() #No missing values for total cases column. 
## 
## 1138778 1160536 1798793 1959812 2031973 6188259 
##       1       1       1       1       1       1
table(owid.filtered$totdeaths, useNA = "ifany") %>%
  tail() #Several missing values for total deaths column. Keep this in mind when analyzing. 
## 
## 107861 126360 127104 172683 391267   <NA> 
##      1      1      1      1      1     19
which(is.na(owid.filtered$totdeaths))
##  [1]  22  31  51  57  63  72  98 101 118 123 138 148 149 150 156 177 182 189 191
#Missing countries: Butan, Cambodia, Dominica, Eritrea, Fiji, Grenada, Laos, Lesotho, Mongolia, Namibia, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Seychelles, Timor, Uganda, Vatican, Vietnam.

#Missingness for confounders: 
owid.countries[!complete.cases(owid.countries), ] #Most missingness in smoking categories. 
##     iso.code     continent                         location       date totcases
## 1        AFG          Asia                      Afghanistan 2020-05-31    15208
## 4        AND        Europe                          Andorra 2020-05-31      764
## 5        AGO        Africa                           Angola 2020-05-31       86
## 6        ATG North America              Antigua and Barbuda 2020-05-31       26
## 18       BLZ North America                           Belize 2020-05-31       18
## 20       BTN          Asia                           Bhutan 2020-05-31       43
## 21       BOL South America                          Bolivia 2020-05-31     9982
## 28       BDI        Africa                          Burundi 2020-05-31       63
## 29       KHM          Asia                         Cambodia 2020-05-31      125
## 30       CMR        Africa                         Cameroon 2020-05-31     5904
## 33       CAF        Africa         Central African Republic 2020-05-31     1011
## 34       TCD        Africa                             Chad 2020-05-31      778
## 41       CIV        Africa                    Cote d'Ivoire 2020-05-31     2833
## 43       CUB North America                             Cuba 2020-05-31     2045
## 46       COD        Africa     Democratic Republic of Congo 2020-05-31     3070
## 49       DMA North America                         Dominica 2020-05-31       16
## 54       GNQ        Africa                Equatorial Guinea 2020-05-31     1306
## 55       ERI        Africa                          Eritrea 2020-05-31       39
## 59       FJI       Oceania                             Fiji 2020-05-31       18
## 62       GAB        Africa                            Gabon 2020-05-31     2655
## 68       GRD North America                          Grenada 2020-05-31       23
## 69       GTM North America                        Guatemala 2020-05-31     5087
## 70       GIN        Africa                           Guinea 2020-05-31     3706
## 71       GNB        Africa                    Guinea-Bissau 2020-05-31     1256
## 72       GUY South America                           Guyana 2020-05-31      153
## 74       HND North America                         Honduras 2020-05-31     5202
## 75       HKG          Asia                        Hong Kong 2020-05-31     1084
## 81       IRQ          Asia                             Iraq 2020-05-31     6439
## 87       JOR          Asia                           Jordan 2020-05-31      739
## 90  OWID_KOS        Europe                           Kosovo 2020-05-31     1070
## 93       LAO          Asia                             Laos 2020-05-31       19
## 96       LSO        Africa                          Lesotho 2020-05-31        2
## 98       LBY        Africa                            Libya 2020-05-31      156
## 99       LIE        Europe                    Liechtenstein 2020-05-31       82
## 102      MDG        Africa                       Madagascar 2020-05-31      771
## 108      MRT        Africa                       Mauritania 2020-05-31      530
## 112      MCO        Europe                           Monaco 2020-05-31       99
## 113      MNG          Asia                         Mongolia 2020-05-31      179
## 118      NAM        Africa                          Namibia 2020-05-31       24
## 122      NIC North America                        Nicaragua 2020-05-31      759
## 125      MKD        Europe                  North Macedonia 2020-05-31     2226
## 129      PSE          Asia                        Palestine 2020-05-31      448
## 131      PNG       Oceania                 Papua New Guinea 2020-05-31        8
## 133      PER South America                             Peru 2020-05-31   164476
## 141      KNA North America            Saint Kitts and Nevis 2020-05-31       15
## 142      LCA North America                      Saint Lucia 2020-05-31       18
## 143      VCT North America Saint Vincent and the Grenadines 2020-05-31       26
## 144      SMR        Europe                       San Marino 2020-05-31      671
## 145      STP        Africa            Sao Tome and Principe 2020-05-31      483
## 148      SRB        Europe                           Serbia 2020-05-31    11412
## 149      SYC        Africa                       Seychelles 2020-05-31       11
## 154      SOM        Africa                          Somalia 2020-05-31     1976
## 157      SSD        Africa                      South Sudan 2020-05-31      994
## 160      SDN        Africa                            Sudan 2020-05-31     4800
## 164      SYR          Asia                            Syria 2020-05-31      122
## 165      TWN          Asia                           Taiwan 2020-05-31      442
## 166      TJK          Asia                       Tajikistan 2020-05-31     3930
## 169      TLS          Asia                            Timor 2020-05-31       24
## 171      TTO North America              Trinidad and Tobago 2020-05-31      117
## 174      UGA        Africa                           Uganda 2020-05-31      417
## 181      VAT        Europe                          Vatican 2020-05-31       12
## 182      VEN South America                        Venezuela 2020-05-31     1510
## 183      VNM          Asia                          Vietnam 2020-05-31      328
##     totdeaths totcases.mil totdeaths.mil population pop.density med.age older70
## 1         258      390.667         6.628   38928341      54.422    18.6   1.337
## 4          51     9888.048       660.066      77265     163.755      NA      NA
## 5           4        2.617         0.122   32866268      23.890    16.8   1.362
## 6           3      265.501        30.635      97928     231.845    32.1   4.631
## 18          2       45.269         5.030     397621      16.426    25.0   2.279
## 20         NA       55.727            NA     771612      21.188    28.6   2.977
## 21        313      855.134        26.814   11673029      10.202    25.4   4.393
## 28          1        5.298         0.084   11890781     423.062    17.5   1.504
## 29         NA        7.477            NA   16718971      90.672    25.6   2.385
## 30        191      222.408         7.195   26545864      50.885    18.8   1.919
## 33          2      209.327         0.414    4829764       7.479    18.3   2.251
## 34         65       47.364         3.957   16425859      11.833    16.7   1.446
## 41         33      107.399         1.251   26378275      76.399    18.7   1.582
## 43         83      180.548         7.328   11326616     110.408    43.1   9.719
## 46         72       34.278         0.804   89561404      35.879    17.0   1.745
## 49         NA      222.250            NA      71991      98.567      NA      NA
## 54         12      930.872         8.553    1402985      45.194    22.4   1.752
## 55         NA       10.997            NA    3546427      44.304    19.3   2.171
## 59         NA       20.079            NA     896444      49.562    28.6   3.284
## 62         17     1192.868         7.638    2225728       7.859    23.1   2.976
## 68         NA      204.410            NA     112519     317.132    29.4   5.021
## 69        108      283.943         6.028   17915567     157.834    22.9   3.016
## 70         23      282.194         1.751   13132792      51.755    19.0   1.733
## 71          8      638.212         4.065    1967998      66.191    19.4   1.565
## 72         12      194.518        15.256     786559       3.952    26.3   2.837
## 74        212      525.210        21.404    9904608      82.805    24.9   2.883
## 75          4      144.591         0.534    7496988    7039.714    44.8  10.158
## 81        205      160.085         5.097   40222503      88.125    20.0   1.957
## 87          9       72.429         0.882   10203140     109.285    23.2   2.361
## 90         30      553.608        15.522    1932774     168.155      NA      NA
## 93         NA        2.611            NA    7275556      29.715    24.4   2.322
## 96         NA        0.934            NA    2142252      73.562    22.2   2.647
## 98          5       22.703         0.728    6871287       3.623    29.0   2.816
## 99          1     2150.143        26.221      38137     237.012      NA      NA
## 102         6       27.843         0.217   27691019      43.951    19.6   1.686
## 108        23      113.987         4.947    4649660       4.289    20.3   1.792
## 112         4     2522.679       101.926      39244   19347.500      NA      NA
## 113        NA       54.602            NA    3278292       1.980    28.6   2.421
## 118        NA        9.445            NA    2540916       3.078    22.0   2.085
## 122        35      114.574         5.283    6624554      51.667    27.3   3.519
## 125       133     1068.456        63.839    2083380      82.600    39.1   8.160
## 129         3       87.819         0.588    5101416     778.202    20.4   1.726
## 131        NA        0.894            NA    8947027      18.220    22.6   2.142
## 133     20710     4988.377       628.112   32971846      25.129    29.1   4.455
## 141        NA      281.997            NA      53192     212.865      NA      NA
## 142        NA       98.024            NA     183629     293.187    34.9   6.405
## 143        NA      234.346            NA     110947     281.787    31.8   4.832
## 144        42    19771.348      1237.551      33938     556.667      NA      NA
## 145        12     2203.859        54.754     219161     212.841    18.7   2.162
## 148       243     1677.102        35.711    6804596      80.291    41.2      NA
## 149        NA      111.857            NA      98340     208.354    36.2   5.586
## 154        78      124.330         4.908   15893219      23.500    16.8   1.496
## 157        10       88.800         0.893   11193729          NA    19.2   2.032
## 160       286      109.466         6.522   43849269      23.258    19.7   2.034
## 164         5        6.971         0.286   17500657          NA    21.7   2.577
## 165         7       18.558         0.294   23816775          NA    42.2   8.353
## 166        47      412.052         4.928    9537642      64.281    23.3   2.155
## 169        NA       18.203            NA    1318442      87.176    18.0   1.897
## 171         8       83.602         5.716    1399491     266.886    36.2   5.819
## 174        NA        9.117            NA   45741000     213.759    16.4   1.308
## 181        NA    14833.127            NA        809          NA      NA      NA
## 182        14       53.102         0.492   28435943      36.253    29.0   3.915
## 183        NA        3.370            NA   97338583     308.127    32.6   4.718
##       gdp.cap card.death diab.prev fem.smokers male.smokers
## 1    1803.987    597.029      9.59          NA           NA
## 4          NA    109.135      7.97        29.0         37.8
## 5    5819.495    276.045      3.94          NA           NA
## 6   21490.943    191.511     13.17          NA           NA
## 18   7824.362    176.957     17.11          NA           NA
## 20   8708.597    217.066      9.75          NA           NA
## 21   6885.829    204.299      6.89          NA           NA
## 28    702.225    293.068      6.05          NA           NA
## 29   3645.070    270.892      4.00         2.0         33.7
## 30   3364.926    244.661      7.20          NA           NA
## 33    661.240    435.727      6.10          NA           NA
## 34   1768.153    280.995      6.10          NA           NA
## 41   3601.006    303.740      2.42          NA           NA
## 43         NA    190.968      8.27        17.1         53.3
## 46    808.133    318.949      6.10          NA           NA
## 49   9673.367    227.376     11.62          NA           NA
## 54  22604.873    202.812      7.78          NA           NA
## 55   1510.459    311.110      6.05         0.2         11.4
## 59   8702.975    412.820     14.49        10.2         34.8
## 62  16562.413    259.967      7.20          NA           NA
## 68  13593.877    243.964     10.71          NA           NA
## 69   7423.808    155.898     10.18          NA           NA
## 70   1998.926    336.717      2.42          NA           NA
## 71   1548.675    382.474      2.42          NA           NA
## 72   7435.047    373.159     11.62          NA           NA
## 74   4541.795    240.208      7.21         2.0           NA
## 75  56054.920         NA      8.33          NA           NA
## 81  15663.986    218.612      8.83          NA           NA
## 87   8337.490    208.257     11.75          NA           NA
## 90   9795.834         NA        NA          NA           NA
## 93   6397.360    368.111      4.00         7.3         51.2
## 96   2851.153    405.126      3.94         0.4         53.9
## 98  17881.509    341.862     10.43          NA           NA
## 99         NA         NA      7.77          NA           NA
## 102  1416.440    405.994      3.94          NA           NA
## 108  3597.633    232.347      2.42          NA           NA
## 112        NA         NA      5.46          NA           NA
## 113 11840.846    460.043      4.82         5.5         46.5
## 118  9541.808    243.811      3.94         9.7         34.2
## 122  5321.444    137.016     11.47          NA           NA
## 125 13111.214    322.688     10.08          NA           NA
## 129  4449.898    265.910     10.59          NA           NA
## 131  3823.194    561.494     17.65        23.5         48.8
## 133 12236.706     85.755      5.95         4.8           NA
## 141 24654.385         NA     12.84          NA           NA
## 142 12951.839    204.620     11.62          NA           NA
## 143 10727.146    252.675     11.62          NA           NA
## 144 56861.470         NA      5.64          NA           NA
## 145  3052.714    270.113      2.42          NA           NA
## 148 14048.881    439.415     10.08        37.7         40.2
## 149 26382.287    242.648     10.55         7.1         35.7
## 154        NA    365.769      6.05          NA           NA
## 157  1569.888    280.775     10.43          NA           NA
## 160  4466.507    431.388     15.67          NA           NA
## 164        NA    376.264        NA          NA           NA
## 165        NA    103.957        NA          NA           NA
## 166  2896.913    427.698      7.11          NA           NA
## 169  6570.102    335.346      6.86         6.3         78.1
## 171 28763.071    228.467     10.97          NA           NA
## 174  1697.707    213.333      2.50         3.4         16.7
## 181        NA         NA        NA          NA           NA
## 182 16745.022    204.850      6.47          NA           NA
## 183  6171.884    245.465      6.00         1.0         45.9

Modified Data Frames

Filtered OWID data frames further narrowed to COVID-19 cases only, deaths only, and hospitalizations only, separated by continent and by country respectively.

#By continent:
select(owid.continents, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by continent. 
select(owid.continents, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by continent. 
select(owid.continents, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by continent. 

#By country:
select(owid.countries, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by country. 
select(owid.countries, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by country. 
select(owid.countries, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by country.  

Case Count Plots

Continent COVID-19 Plots

Plots for exploratory analysis comparing COVID-19 cases by continent for total cases, total cases per million people, total deaths, and total deaths per million people.

continents1 <- owid.continents %>%
                  filter(!(location %in% c("International", "World"))) %>%
                  ggplot(aes(x = location, y = totcases)) +
                  geom_bar(stat="identity") +
                  geom_text(aes(label = totcases), vjust = -0.25, size = 3) + 
                  labs(title = "Total COVID-19 Cases by Continent") +
                  labs(x = "Continent", y = "Total COVID-19 Cases") +
                  theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases by continent. 

continents2 <- owid.continents %>%
                  filter(!(location %in% c("International", "World"))) %>%
                  ggplot(aes(x = location, y = totcases.mil)) +
                  geom_bar(stat="identity") + 
                  geom_text(aes(label = totcases.mil), vjust = -0.25, size = 3) + 
                  labs(title = "Total COVID-19 Cases per Million People by Continent") +
                  labs(x = "Continent", y = "Total COVID-19 Cases per Million") +
                  theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases per million by continent.

plot_grid(continents1, continents2, labels = "AUTO")

continents3 <- owid.continents %>%
                    filter(!(location %in% c("International", "World"))) %>%
                    ggplot(aes(x = location, y = totdeaths)) +
                    geom_bar(stat="identity") +
                    geom_text(aes(label = totdeaths), vjust = -0.25, size = 3) + 
                    labs(title = "Total COVID-19 Deaths by Continent") +
                    labs(x = "Continent", y = "Total COVID-19 Cases") +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths by continent. 

continents4 <- owid.continents %>%
                    filter(!(location %in% c("International", "World"))) %>%
                    ggplot(aes(x = location, y = totdeaths.mil)) +
                    geom_bar(stat="identity") +
                    geom_text(aes(label = totdeaths.mil), vjust = -0.25, size = 3) + 
                    labs(title = "Total COVID-19 Deaths per Million by Continent") +
                    labs(x = "Continent", y = "Total COVID-19 Cases") +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths per million by continent.

plot_grid(continents3, continents4, labels = "AUTO")

Top 10 Country COVID-19 Plots

Barplots filtering out the 10 countries with the highest COVID-19 cases per million people on each continent, as well as a heatmap of all countries, are the outputs of the code below:

ctry.mean.cases <- owid.countries %>%
    group_by(continent) %>%
    summarise(totcases.mil = mean(totcases.mil, na.rm = TRUE)) #Mean COVID-19 cases per country, grouped by continent.
mean.cases <- function(x) {
                      ctry.mean.cases %>%
                      filter(continent == x) %>%
                      select(totcases.mil) %>%
                      as.numeric()
} #Function to produce numeric value of mean COVID-19 cases per country by continent to be used in bar graphs below. 
top.ctry.cases <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    arrange(desc(totcases.mil)) %>%
                    head(n=10) %>%
                    ggplot(aes(x = location, y = totcases.mil)) +
                        geom_bar(stat = "identity") +
                        geom_abline(slope=0, intercept= mean.cases(x),  col = "red", lty=2) +
                        labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(top.ctry.cases(i))
} #Loops function for continents of interest. 

ctry.mean.deaths <- owid.countries %>%
    group_by(continent) %>%
    summarise(totdeaths.mil = mean(totdeaths.mil, na.rm = TRUE)) #Mean COVID-19 daeths per country, grouped by continent.
mean.deaths <- function(x) {
                      ctry.mean.deaths %>%
                      filter(continent == x) %>%
                      select(totdeaths.mil) %>%
                      as.numeric()
} #Function to produce numeric value of mean COVID-19 deaths per country by continent to be used in bar graphs below. 
top.ctry.deaths <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    arrange(desc(totdeaths.mil)) %>%
                    head(n=10) %>%
                    ggplot(aes(x = location, y = totdeaths.mil)) +
                        geom_bar(stat = "identity") +
                        geom_abline(slope=0, intercept= mean.deaths(x),  col = "red", lty=2) +
                        labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(top.ctry.deaths(i))
} #Loops function for continents of interest. 

Heatmaps of COVID-19 Cases Over Time, All Countries

owid %>%
  rename(iso.code = iso_code, totcases.mil = total_cases_per_million) %>%
  select(c(iso.code:date, totcases.mil)) %>%
  filter(!is.na(continent)) %>%
  ggplot(aes(x = date, y = location, fill = totcases.mil)) +
    geom_tile() +
    labs(title = "Total COVID-19 Cases per Million Over Time", size = 22) +
    scale_x_date(date_breaks = "1 month") +
    scale_fill_gradient(low = "#ffeda0", high = "#f03b20") +
    ggsave(file = "test.image.pdf", width = 20, height = 25) #Total cases per million over time by country as heatmap. 

owid %>%
  rename(iso.code = iso_code, new.cases = new_cases) %>%
  select(c(iso.code:date, new.cases)) %>%
  filter(!is.na(continent)) %>%
  ggplot(aes(x = date, y = location, fill = new.cases)) +
    geom_tile() +
    labs(title = "New COVID-19 Cases per Million Over Time", size = 22) +
    scale_x_date(date_breaks = "1 month") +
    scale_fill_gradient(low = "#ffeda0", high = "#f03b20", limits = c(0, 20000)) +
    ggsave(file = "test.image2.pdf", width = 20, height = 25) #New cases per million over time by country as heatmap. Difficult to scale because range is so large.  

Population Size and COVID-19 Outcomes

Population Size and COVID-19 Cases per Million

Population size and population density are evaluated below as confounding variables for COVID-19 cases. These variables are first analyzed on a full-world scale, and then by continent.

ggplot(data = owid.countries, aes(x = population, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
  labs(title = ("World COVID-19 Cases per Million People vs. Population Size")) #COVID-19 cases and population size on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$population, use = "complete.obs") 
## [1] -0.05455269
pop.lm.cases <- lm(totcases.mil ~ population, data = owid.countries)
pop.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
## 
## Coefficients:
## (Intercept)   population  
##   1.378e+03   -9.934e-07
summary.lm(pop.lm.cases) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1375.3 -1273.2 -1085.4   170.6 18393.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.378e+03  2.092e+02   6.588 4.54e-10 ***
## population  -9.934e-07  1.340e-06  -0.741     0.46    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2750 on 184 degrees of freedom
## Multiple R-squared:  0.002976,   Adjusted R-squared:  -0.002443 
## F-statistic: 0.5492 on 1 and 184 DF,  p-value: 0.4596
ggplot(data = owid.countries, aes(x = pop.density, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
  labs(title = ("World COVID-19 Cases per Million People vs. Population Density")) #COVID-19 cases and population size on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$pop.density, use = "complete.obs") 
## [1] 0.09592666
pop.den.lm.cases <- lm(totcases.mil ~ pop.density, data = owid.countries)
pop.den.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
## 
## Coefficients:
## (Intercept)  pop.density  
##    1231.377        0.152
summary.lm(pop.den.lm.cases) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2157.1 -1177.8  -994.2   130.6 18487.2 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1231.3772   195.4280   6.301 2.21e-09 ***
## pop.density    0.1520     0.1176   1.293    0.198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2578 on 180 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.009202,   Adjusted R-squared:  0.003697 
## F-statistic: 1.672 on 1 and 180 DF,  p-value: 0.1977
cases.pop <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = population, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
} 
pop.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totcases.mil ~ population,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
   print(cases.pop(i)) 
   print(pop.lm.cases.cont(i)) 
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1497.85  -345.09  -175.71   -90.24  2662.88 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.970e+02  1.820e+02   2.182   0.0406 *  
## population  1.399e-05  2.434e-06   5.749 1.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 820.1 on 21 degrees of freedom
## Multiple R-squared:  0.6115, Adjusted R-squared:  0.593 
## F-statistic: 33.05 on 1 and 21 DF,  p-value: 1.047e-05

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2695.4 -2216.3 -1191.6   828.5 16764.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.007e+03  6.407e+02   4.694 2.63e-05 ***
## population  -6.117e-06  1.972e-05  -0.310    0.758    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3758 on 44 degrees of freedom
## Multiple R-squared:  0.002183,   Adjusted R-squared:  -0.0205 
## F-statistic: 0.09624 on 1 and 44 DF,  p-value: 0.7579

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1496.4 -1408.7 -1201.6   507.7 18248.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.509e+03  4.997e+02   3.020  0.00415 **
## population  -1.384e-06  1.671e-06  -0.828  0.41190   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3236 on 45 degrees of freedom
## Multiple R-squared:  0.01502,    Adjusted R-squared:  -0.006873 
## F-statistic: 0.686 on 1 and 45 DF,  p-value: 0.4119

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -321.11 -232.55 -199.00  -14.54 3069.34 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.283e+02  9.379e+01   3.500 0.000964 ***
## population  -2.899e-06  2.168e-06  -1.337 0.187074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 564.8 on 52 degrees of freedom
## Multiple R-squared:  0.03323,    Adjusted R-squared:  0.01464 
## F-statistic: 1.787 on 1 and 52 DF,  p-value: 0.1871

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##       1       2       3       4 
##   16.16  -67.22  196.03 -144.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.078e+01  1.313e+02   0.615    0.601
## population  7.275e-06  9.564e-06   0.761    0.526
## 
## Residual standard error: 179.2 on 2 degrees of freedom
## Multiple R-squared:  0.2243, Adjusted R-squared:  -0.1635 
## F-statistic: 0.5785 on 1 and 2 DF,  p-value: 0.5263

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1446.1 -1227.4 -1084.0    11.2  5023.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.319e+03  7.569e+02   1.742    0.112
## population  6.354e-06  1.143e-05   0.556    0.591
## 
## Residual standard error: 2204 on 10 degrees of freedom
## Multiple R-squared:  0.02996,    Adjusted R-squared:  -0.06705 
## F-statistic: 0.3088 on 1 and 10 DF,  p-value: 0.5906
cases.pop.den <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = pop.density, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Density"))
} 
pop.den.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totcases.mil ~ pop.density,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.pop.den(i))
  print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1199.9  -668.7  -356.4    -7.5  4244.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1292.670    405.934   3.184  0.00446 **
## pop.density   -2.893      1.689  -1.713  0.10149   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1232 on 21 degrees of freedom
## Multiple R-squared:  0.1226, Adjusted R-squared:  0.08078 
## F-statistic: 2.933 on 1 and 21 DF,  p-value: 0.1015

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2356.3 -2006.6  -957.2   929.5 17129.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.633e+03  5.072e+02   5.191  5.4e-06 ***
## pop.density 1.617e-02  1.751e-01   0.092    0.927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3332 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0001984,  Adjusted R-squared:  -0.02305 
## F-statistic: 0.008531 on 1 and 43 DF,  p-value: 0.9268

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3520.3 -1236.9 -1147.3   556.9 18445.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 1229.2082   523.4504   2.348   0.0235 *
## pop.density    0.3460     0.3177   1.089   0.2822  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3277 on 43 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.02684,    Adjusted R-squared:  0.004206 
## F-statistic: 1.186 on 1 and 43 DF,  p-value: 0.2822

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -266.99 -236.07 -181.56  -44.54 3127.32 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 272.6243   102.1690   2.668   0.0102 *
## pop.density  -0.1263     0.6192  -0.204   0.8392  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 579.3 on 51 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.000815,   Adjusted R-squared:  -0.01878 
## F-statistic: 0.0416 on 1 and 51 DF,  p-value: 0.8392

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##       1       2       3       4 
##   26.13   12.58  136.11 -174.81 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  273.491    130.823   2.091    0.172
## pop.density   -5.367      4.677  -1.148    0.370
## 
## Residual standard error: 158 on 2 degrees of freedom
## Multiple R-squared:  0.397,  Adjusted R-squared:  0.09555 
## F-statistic: 1.317 on 1 and 2 DF,  p-value: 0.3699

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1791.1 -1201.3  -903.1   -36.4  4919.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   934.07    1096.78   0.852    0.414
## pop.density    25.10      36.77   0.683    0.510
## 
## Residual standard error: 2187 on 10 degrees of freedom
## Multiple R-squared:  0.04454,    Adjusted R-squared:  -0.051 
## F-statistic: 0.4662 on 1 and 10 DF,  p-value: 0.5103

Population Size and COVID-19 Deaths per Million

Population size and population density are evaluated below as confounding variables for COVID-19 deaths. These variables are first analyzed on a full-world scale, and then by continent.

ggplot(data = owid.countries, aes(x = population, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
  labs(title = ("World COVID-19 Deaths per Million People vs. Population Size")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$population, use = "complete.obs") 
## [1] -0.01920185
pop.lm.deaths <- lm(totdeaths.mil ~ population, data = owid.countries)
pop.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
## 
## Coefficients:
## (Intercept)   population  
##   6.344e+01   -1.960e-08
summary.lm(pop.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -63.12  -60.42  -54.86  -29.27 1174.11 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.344e+01  1.307e+01   4.853  2.8e-06 ***
## population  -1.960e-08  7.946e-08  -0.247    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 162.4 on 165 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.0003687,  Adjusted R-squared:  -0.00569 
## F-statistic: 0.06086 on 1 and 165 DF,  p-value: 0.8054
ggplot(data = owid.countries, aes(x = pop.density, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
  labs(title = ("World COVID-19 Deaths per Million People vs. Population Density")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$pop.density, use = "complete.obs") 
## [1] 0.006573727
pop.den.lm.deaths <- lm(totdeaths.mil ~ pop.density, data = owid.countries)
pop.den.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
## 
## Coefficients:
## (Intercept)  pop.density  
##   6.345e+01    6.254e-04
summary.lm(pop.den.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -67.32  -60.86  -56.24  -29.86 1173.75 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.345e+01  1.308e+01   4.851 2.86e-06 ***
## pop.density 6.254e-04  7.475e-03   0.084    0.933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 163.7 on 162 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  4.321e-05,  Adjusted R-squared:  -0.006129 
## F-statistic: 0.007001 on 1 and 162 DF,  p-value: 0.9334
deaths.pop <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = population, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Size"))
} 
pop.lm.deaths.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totdeaths.mil ~ population,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.pop(i))
  print(pop.lm.deaths.cont(i))
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.650 -22.027 -15.337   8.501 155.684 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.990e+01  1.203e+01   1.654    0.118    
## population  8.979e-07  1.424e-07   6.306 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.09 on 16 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.7131, Adjusted R-squared:  0.6952 
## F-statistic: 39.77 on 1 and 16 DF,  p-value: 1.044e-05

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -295.08 -139.13 -109.91   22.27 1084.80 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1.527e+02  4.546e+01   3.359  0.00165 **
## population  1.196e-06  1.384e-06   0.864  0.39225   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 262.7 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01707,    Adjusted R-squared:  -0.005785 
## F-statistic: 0.7469 on 1 and 43 DF,  p-value: 0.3923

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.333  -9.482  -6.405  -1.058  81.553 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.171e+01  3.120e+00   3.753 0.000569 ***
## population  -5.163e-09  9.759e-09  -0.529 0.599776    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.77 on 39 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.007125,   Adjusted R-squared:  -0.01833 
## F-statistic: 0.2799 on 1 and 39 DF,  p-value: 0.5998

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.050 -4.417 -2.140  1.130 49.289 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.471e+00  1.514e+00   3.613 0.000734 ***
## population  -2.837e-08  3.370e-08  -0.842 0.404077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.608 on 47 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.01486,    Adjusted R-squared:  -0.006102 
## F-statistic: 0.7089 on 1 and 47 DF,  p-value: 0.4041

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  4.684e+00        NaN     NaN      NaN
## population  -2.529e-08        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 1 and 0 DF,  p-value: NA

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -87.26 -77.23 -64.89 -32.83 538.27 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.471e+01  6.396e+01   1.168    0.270
## population  4.588e-07  9.662e-07   0.475    0.645
## 
## Residual standard error: 186.2 on 10 degrees of freedom
## Multiple R-squared:  0.02206,    Adjusted R-squared:  -0.07574 
## F-statistic: 0.2255 on 1 and 10 DF,  p-value: 0.6451
deaths.pop.den <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = pop.density, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Density"))
} 
pop.den.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totdeaths.mil ~ pop.density,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.pop.den(i))
  print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest.

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.355 -45.177 -22.267   7.791 252.855 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  79.2774    28.0891   2.822   0.0123 *
## pop.density  -0.1761     0.1187  -1.483   0.1574  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 82.43 on 16 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1209, Adjusted R-squared:  0.06593 
## F-statistic:   2.2 on 1 and 16 DF,  p-value: 0.1574

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -168.39 -148.77 -121.59    2.42 1064.84 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 173.722636  40.334921   4.307 9.42e-05 ***
## pop.density  -0.001816   0.013921  -0.130    0.897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 265 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0003954,  Adjusted R-squared:  -0.02285 
## F-statistic: 0.01701 on 1 and 43 DF,  p-value: 0.8968

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.516  -9.817  -6.638  -0.171  80.153 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.754973   3.283937   3.884  0.00041 ***
## pop.density -0.001575   0.001856  -0.848  0.40173    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.99 on 37 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.01908,    Adjusted R-squared:  -0.007434 
## F-statistic: 0.7196 on 1 and 37 DF,  p-value: 0.4017

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.894 -4.129 -2.571  0.850 49.621 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 4.501368   1.599068   2.815  0.00716 **
## pop.density 0.002969   0.009547   0.311  0.75723   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.738 on 46 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.002098,   Adjusted R-squared:  -0.0196 
## F-statistic: 0.0967 on 1 and 46 DF,  p-value: 0.7572

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  3.92739        NaN     NaN      NaN
## pop.density  0.03486        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 1 and 0 DF,  p-value: NA

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -118.40  -73.23  -38.69  -21.33  535.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   34.976     91.898   0.381    0.711
## pop.density    2.304      3.081   0.748    0.472
## 
## Residual standard error: 183.2 on 10 degrees of freedom
## Multiple R-squared:  0.05295,    Adjusted R-squared:  -0.04175 
## F-statistic: 0.5591 on 1 and 10 DF,  p-value: 0.4718

GDP and COVID-19 Outcomes

GDP and COVID-19 Cases per Million

First, a linear regression is performed on world COVID-19 cases and GDP. Then, scatter plots with a smooth line showing the relationship between GDP per capita and COVID-19 cases per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = gdp.cap, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
  labs(title = paste0("World COVID-19 Cases per Million People in vs. GDP per Capita")) #COVID-19 cases and GDP per capita on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$gdp.cap, use = "complete.obs") 
## [1] 0.6567062
gdp.lm.cases <- lm(totcases.mil ~ gdp.cap, data = owid.countries)
gdp.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
## 
## Coefficients:
## (Intercept)      gdp.cap  
##  -358.20891      0.08419
summary.lm(gdp.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5365.0  -832.3   -60.2   312.2 15342.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.582e+02  1.985e+02  -1.804   0.0729 .  
## gdp.cap      8.419e-02  7.288e-03  11.552   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1911 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.4313, Adjusted R-squared:  0.428 
## F-statistic: 133.5 on 1 and 176 DF,  p-value: < 2.2e-16
cases.gdp <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = gdp.cap, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. GDP per Capita"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.gdp(i))
} #Loops function for continents of interest. 

GDP and COVID-19 Deaths per Million

First, a linear regression is performed on world COVID-19 deaths and GDP. Scatter plots with a smooth line showing the relationship between GDP per capita and COVID-19 deaths per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = gdp.cap, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
  labs(title = paste0("World COVID-19 Deaths per Million People in vs. GDP per Capita")) #COVID-19 deaths and GDP per capita on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$gdp.cap, use = "complete.obs") 
## [1] 0.3552196
gdp.lm.deaths <- lm(totdeaths.mil ~ gdp.cap, data = owid.countries)
gdp.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
## 
## Coefficients:
## (Intercept)      gdp.cap  
##    5.378126     0.002755
summary.lm(gdp.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -314.35  -44.65  -18.93   -7.09 1075.52 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.378e+00  1.642e+01   0.328    0.744    
## gdp.cap     2.755e-03  5.768e-04   4.777 4.04e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 148.3 on 158 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.1262, Adjusted R-squared:  0.1207 
## F-statistic: 22.82 on 1 and 158 DF,  p-value: 4.042e-06
deaths.gdp <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = gdp.cap, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. GDP per Capita"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.gdp(i))
} #Loops function for continents of interest. 

Median Age and COVID-19 Outcomes

Median Age and COVID-19 Cases per Million

First, scatter plot shows all countries and the relationship between median age and COVID-19 outcomes. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 cases per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.

ggplot(data = owid.countries, aes(x = med.age, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Median Age", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Median Population Age") #COVID-19 cases vs. median age on world scale. 

cor(owid.countries$totcases.mil, owid.countries$med.age, use = "complete.obs")
## [1] 0.326925
age.lm.cases <- lm(totcases.mil ~ med.age, data = owid.countries)
age.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##    -1161.02        74.91
summary.lm(age.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2317.4  -954.8  -304.1    25.1 18524.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1161.02     518.02  -2.241   0.0263 *  
## med.age        74.91      16.32   4.589 8.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1998 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.1069, Adjusted R-squared:  0.1018 
## F-statistic: 21.06 on 1 and 176 DF,  p-value: 8.428e-06
exp.model.cases <- lm(log(totcases.mil) ~ med.age, data = owid.countries)
exp.model.cases
## 
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##      1.6876       0.1259
summary.lm(exp.model.cases)
## 
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6445 -1.0443  0.0571  1.1717  4.1880 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.68764    0.43260   3.901 0.000136 ***
## med.age      0.12587    0.01363   9.234  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.668 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.3263, Adjusted R-squared:  0.3225 
## F-statistic: 85.26 on 1 and 176 DF,  p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.
##    med.age
## 1 30.37978
cases.med.age <- function(x) { 
                    owid.countries %>%
                    mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
                    mutate(med.age <- as.factor(med.age)) %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = location, y = totcases.mil, fill = factor(med.age))) +
                        geom_bar(stat = "identity", position = "dodge") +
                        theme_bw() +
                        labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
                        scale_fill_brewer(palette = "Set1") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Cases per Million People by Country in ", (continent = x), " and Median Age"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.med.age(i))
} #Loops function for continents of interest.

Median Age and COVID-19 Deaths per Million

First, scatter plot shows all countries and the relationship between median age and COVID-19 deaths. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 daeths per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.

ggplot(data = owid.countries, aes(x = med.age, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Median Age", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Median Population Age") #COVID-19 deaths vs. median age on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$med.age, use = "complete.obs")
## [1] 0.368092
age.lm.deaths <- lm(totdeaths.mil ~ med.age, data = owid.countries)
age.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##    -101.444        4.981
summary.lm(age.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -131.55  -53.94  -18.72   10.14  710.08 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -101.4439    31.9989  -3.170  0.00183 ** 
## med.age        4.9813     0.9947   5.008 1.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 118.3 on 160 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.1355, Adjusted R-squared:  0.1301 
## F-statistic: 25.08 on 1 and 160 DF,  p-value: 1.442e-06
exp.model.deaths <- lm(log(totdeaths.mil) ~ med.age, data = owid.countries)
exp.model.deaths
## 
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##     -2.1782       0.1375
summary.lm(exp.model.deaths)
## 
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8493 -1.0508  0.1557  1.0573  4.6191 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.17815    0.44962  -4.844 2.98e-06 ***
## med.age      0.13752    0.01398   9.839  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.663 on 160 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.377,  Adjusted R-squared:  0.3731 
## F-statistic:  96.8 on 1 and 160 DF,  p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.
##    med.age
## 1 30.37978
deaths.med.age <- function(x) { 
                    owid.countries %>%
                    mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
                    mutate(med.age <- as.factor(med.age)) %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = location, y = totdeaths.mil, fill = factor(med.age))) +
                        geom_bar(stat = "identity", position = "dodge") +
                        theme_bw() +
                        labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
                        scale_fill_brewer(palette = "Set1") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Deaths per Million People by Country in ", (continent = x), " and Median Age"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.med.age(i))
} #Loops function for continents of interest.

Smoking and COVID-19 Outcomes

Smoking and COVID-19 Cases per Million

Graphs below show percentages of female and male smokers by country and the correlation to COVID-19 cases.

ggplot(data = owid.countries, aes(x = fem.smokers, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Percentage of Female Smokers") #COVID-19 cases vs. female smokers on world scale. 

cor(owid.countries$totcases.mil, owid.countries$fem.smokers, use = "complete.obs") 
## [1] 0.2013998
ggplot(data = owid.countries, aes(x = male.smokers, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Percentage of Male Smokers") #COVID-19 cases vs. male smokers on world scale. 

cor(owid.countries$totcases.mil, owid.countries$male.smokers, use = "complete.obs") 
## [1] -0.06492578
cases.smoking.f <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = fem.smokers, y = totcases.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Cases per Million People") +
                      labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.smoking.f(i)) #Apply function to each continent. 
}

cases.smoking.m <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = male.smokers, y = totcases.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Cases per Million People") +
                      labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.smoking.m(i)) #Apply function to each continent. 
}

Smoking and COVID-19 Deaths per Million

Graphs below show percentages of female and male smokers in the world and by country and the correlation to COVID-19 deaths.

ggplot(data = owid.countries, aes(x = fem.smokers, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Female Smokers") #COVID-19 deaths vs. female smokers on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$fem.smokers, use = "complete.obs") 
## [1] 0.369769
ggplot(data = owid.countries, aes(x = male.smokers, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Male Smokers") #COVID-19 deaths vs. male smokers on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$male.smokers, use = "complete.obs") 
## [1] -0.09632521
deaths.smoking.f <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = fem.smokers, y = totdeaths.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
                      labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.smoking.f(i)) #Apply function to each continent. 
}

deaths.smoking.m <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = male.smokers, y = totdeaths.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
                      labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.smoking.m(i)) #Apply function to each continent. 
}

#Smoking likely not significant: when adjusting for GDP, statistical significance actually goes down. Same occurs when applied code to total deaths (code not shown).
summary(lm(totcases.mil ~ fem.smokers + gdp.cap, data = owid.countries)) #Female smokers with and without ajdusting for GDP. 
## 
## Call:
## lm(formula = totcases.mil ~ fem.smokers + gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5439.7  -775.0  -170.8   266.3 10198.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -190.31965  219.81352  -0.866    0.388    
## fem.smokers  -21.23930   14.34715  -1.480    0.141    
## gdp.cap        0.08348    0.00719  11.611   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1637 on 135 degrees of freedom
##   (48 observations deleted due to missingness)
## Multiple R-squared:  0.5139, Adjusted R-squared:  0.5067 
## F-statistic: 71.35 on 2 and 135 DF,  p-value: < 2.2e-16
summary(lm(totcases.mil ~ fem.smokers, data = owid.countries))
## 
## Call:
## lm(formula = totcases.mil ~ fem.smokers, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2456.1 -1111.7  -872.8   341.9 18794.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   921.37     284.94   3.234  0.00153 **
## fem.smokers    46.61      19.29   2.415  0.01703 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2384 on 138 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.04056,    Adjusted R-squared:  0.03361 
## F-statistic: 5.834 on 1 and 138 DF,  p-value: 0.01703
summary(lm(totcases.mil ~ male.smokers + gdp.cap, data = owid.countries)) #Male smokers with and without adjusting for GDP. 
## 
## Call:
## lm(formula = totcases.mil ~ male.smokers + gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5076.1  -766.7   -47.3   311.9 10718.1 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.141e+02  3.966e+02  -0.792    0.430    
## male.smokers -2.003e+00  1.021e+01  -0.196    0.845    
## gdp.cap       8.041e-02  6.692e-03  12.016   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1618 on 133 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.5231, Adjusted R-squared:  0.516 
## F-statistic: 72.95 on 2 and 133 DF,  p-value: < 2.2e-16
summary(lm(totcases.mil ~ male.smokers, data = owid.countries))
## 
## Call:
## lm(formula = totcases.mil ~ male.smokers, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1655.0 -1295.8  -953.0   571.4 18299.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   1762.69     534.72   3.296  0.00125 **
## male.smokers   -11.47      15.12  -0.759  0.44931   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2426 on 136 degrees of freedom
##   (48 observations deleted due to missingness)
## Multiple R-squared:  0.004215,   Adjusted R-squared:  -0.003107 
## F-statistic: 0.5757 on 1 and 136 DF,  p-value: 0.4493

Diabetes Prevalence and COVID-19 Outcomes

Diabetes Prevalence and COVID-19 Cases per Million

First, a linear regression is performed on world COVID-19 cases and diabetes prevalence. Then, scatter plots with a smooth line showing the relationship of diabetes prevalence and COVID-19 cases per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = diab.prev, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Diabetes Prevalence", y = "Total COVID-19 Cases per Million People") +
  labs(title = paste0("World COVID-19 Cases per Million People in vs. Diabetes Prevalence")) #COVID-19 cases and diabetes prevalence on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$diab.prev, use = "complete.obs") 
## [1] 0.1093217
diab.lm.cases <- lm(totcases.mil ~ diab.prev, data = owid.countries)
diab.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ diab.prev, data = owid.countries)
## 
## Coefficients:
## (Intercept)    diab.prev  
##      711.48        74.21
summary.lm(diab.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ diab.prev, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2082.2 -1148.3  -879.2   259.4 18641.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   711.48     430.99   1.651    0.101
## diab.prev      74.21      50.29   1.476    0.142
## 
## Residual standard error: 2576 on 180 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.01195,    Adjusted R-squared:  0.006462 
## F-statistic: 2.177 on 1 and 180 DF,  p-value: 0.1418
cases.diab <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = diab.prev, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Diabetes Prevalence", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Diabetes Prevalence"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.diab(i))
} #Loops function for continents of interest. 

Diabetes Prevalence and COVID-19 Deaths per Million

First, a linear regression is performed on world COVID-19 deaths and diabetes prevalence. Scatter plots with a smooth line showing the relationship of diabetes prevalence and COVID-19 deaths per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = diab.prev, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Diabetes Prevalence", y = "Total COVID-19 Deaths per Million People") +
  labs(title = paste0("World COVID-19 Deaths per Million People in vs. Diabetes Prevalence")) #COVID-19 deaths and diabetes prevalence on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$diab.prev, use = "complete.obs") 
## [1] -0.1266164
diab.lm.deaths <- lm(totdeaths.mil ~ diab.prev, data = owid.countries)
diab.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ diab.prev, data = owid.countries)
## 
## Coefficients:
## (Intercept)    diab.prev  
##     105.516       -5.521
summary.lm(diab.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ diab.prev, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -99.80  -65.23  -46.68   -6.19 1163.17 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  105.516     28.750   3.670 0.000329 ***
## diab.prev     -5.521      3.398  -1.625 0.106183    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 162.4 on 162 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.01603,    Adjusted R-squared:  0.009958 
## F-statistic: 2.639 on 1 and 162 DF,  p-value: 0.1062
deaths.diab <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = diab.prev, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Diabetes Prevalence", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Diabetes Prevalence"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.diab(i))
} #Loops function for continents of interest. 

Cardiovascular Death and COVID-19 Outcomes

Cardiovascular Death and COVID-19 Cases per Million

First, a linear regression is performed on world COVID-19 cases and cardiovascular death. Then, scatter plots with a smooth line showing the relationship between cardiovascular death and COVID-19 cases per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = card.death, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Cardiovascular Death", y = "Total COVID-19 Cases per Million People") +
  labs(title = paste0("World COVID-19 Cases per Million People in vs. Cardiovascular Death")) #COVID-19 cases and cardiovascular death on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$card.death, use = "complete.obs") 
## [1] -0.3278695
card.lm.cases <- lm(totcases.mil ~ card.death, data = owid.countries)
card.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ card.death, data = owid.countries)
## 
## Coefficients:
## (Intercept)   card.death  
##    2750.669       -6.151
summary.lm(card.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ card.death, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2130.0 -1126.2  -637.7   374.0 18089.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2750.669    377.291   7.291 9.89e-12 ***
## card.death    -6.151      1.332  -4.617 7.45e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2086 on 177 degrees of freedom
##   (7 observations deleted due to missingness)
## Multiple R-squared:  0.1075, Adjusted R-squared:  0.1025 
## F-statistic: 21.32 on 1 and 177 DF,  p-value: 7.452e-06
cases.card <- function(x) { 
                  owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = card.death, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Cardiovascular Death", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Cardiovascular Death"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.card(i))
} #Loops function for continents of interest. 

Cardiovascular Death and COVID-19 Deaths per Million

First, a linear regression is performed on world COVID-19 deaths and cardiovascular death rates from 2017. Scatter plots with a smooth line showing the relationship between cardiovascular death rates and COVID-19 deaths per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = card.death, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "purple") +
  theme_bw() +
  labs(x = "Cardiovascular Death", y = "Total COVID-19 Deaths per Million People") +
  labs(title = paste0("World COVID-19 Deaths per Million People in vs. Cardiovascular Death")) #COVID-19 deaths and cardiovascular death on world scale. Red line is smooth fit; purple line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$card.death, use = "complete.obs") 
## [1] -0.3752894
card.lm.deaths <- lm(totdeaths.mil ~ card.death, data = owid.countries)
card.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ card.death, data = owid.countries)
## 
## Coefficients:
## (Intercept)   card.death  
##    164.9514      -0.4312
summary.lm(card.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ card.death, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -123.63  -66.95  -36.31   22.85  701.45 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 164.9514    23.4724   7.027 5.73e-11 ***
## card.death   -0.4312     0.0842  -5.121 8.62e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 126 on 160 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.1408, Adjusted R-squared:  0.1355 
## F-statistic: 26.23 on 1 and 160 DF,  p-value: 8.622e-07
deaths.card <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = card.death, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "purple") +
                        theme_bw() +
                        labs(x = "Cardiovascular Death", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Cardiovascular Death"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.card(i))
} #Loops function for continents of interest. 

Geospatial Analysis

Downloading World Map File

World map file is read in and prepared for overlaying with COVID-19 data.

library(sf)
#File downloaded from https://hub.arcgis.com/datasets/esri::world-countries-generalized-1/explore and moved to working directory. File was unzipped and folder renamed to "world_maps". 
world.sf <- st_read("world_maps/country_gen_trim.shp")  # Read shapefile as an sf object
## Reading layer `country_gen_trim' from data source `/Users/alanaschreibman/COVID-19_NO2/world_maps/country_gen_trim.shp' using driver `ESRI Shapefile'
## Simple feature collection with 256 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -89 xmax: 180 ymax: 83.6236
## Geodetic CRS:  WGS 84
class(world.sf)
## [1] "sf"         "data.frame"
head(world.sf)
## Simple feature collection with 6 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -176.6431 ymin: -21.94084 xmax: -138.8098 ymax: 0.808609
## Geodetic CRS:  WGS 84
##   OBJECTID FIPS_CNTRY ISO_2DIGIT ISO_3DIGIT             NAME    COUNTRYAFF
## 1        1         AQ         AS        ASM   American Samoa United States
## 2        2         WQ         UM        UMI            Baker United States
## 3        3         CW         CK        COK     Cook Islands   New Zealand
## 4        4         FP         PF        PYF French Polynesia        France
## 5        5         WQ         UM        UMI          Howland United States
## 6        6         WQ         UM        UMI           Jarvis United States
##   CONTINENT SHAPE_Leng   SHAPE_Area                       geometry
## 1   Oceania 0.60012436 1.371972e-02 MULTIPOLYGON (((-170.7439 -...
## 2   Oceania 0.02887532 3.430265e-05 MULTIPOLYGON (((-176.4614 0...
## 3   Oceania 0.98066416 1.307346e-02 MULTIPOLYGON (((-159.747 -2...
## 4   Oceania 3.93021062 1.753321e-01 MULTIPOLYGON (((-149.1792 -...
## 5   Oceania 0.05303007 1.811934e-04 MULTIPOLYGON (((-176.6362 0...
## 6   Oceania 0.09758213 6.658148e-04 MULTIPOLYGON (((-160.0211 -...
world.geo <- st_geometry(world.sf)
plot(world.geo)

plot(world.geo, col = "lemonchiffon2")

plot(world.geo, lwd = 0.5, border = "red")

Geospatial Distribution of COVID-19 Cases

Joins OWID data set with world map data frame and then use country cases to shade countries accordingly.

library(RColorBrewer)
library(viridis)

my_theme <- function() {
  theme_minimal() +                                  
  theme(axis.line = element_blank(),
  plot.title  = element_text(size=30),
  axis.text = element_blank(),                 
  axis.title = element_blank(),
  panel.grid = element_line(color = "white"),  
  legend.key.size = unit(0.8, "cm"),          
  legend.text = element_text(size = 20),      
  legend.title = element_text(size = 20))
}

myPalette <- colorRampPalette(brewer.pal(9, "BuPu")) 

world.sf[105, "NAME"] <- "Democratic Republic of Congo"
world.sf <- world.sf %>%
  rename(location = NAME) %>% 
  inner_join(owid.countries, world.sf, by = "location") #Joined OWID data set with .sph file for geospatial analysis.  
ggplot(data = world.sf, aes(fill = totcases.mil),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Population-Adjusted Distribution of COVID-19 Cases") +
  scale_fill_gradientn(name = "COVID-19 \ncases (per million)",  
                    limits = c(0, 6500), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 cases with color shading. 

Geospatial Distribution of COVID-19 Deaths

Uses merged data frame and shades deaths by country accordingly.

ggplot(data = world.sf, aes(fill = totdeaths.mil),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Population-Adjusted Distribution of COVID-19 Deaths") +
  scale_fill_gradientn(name = "COVID-19 \ndeaths (per million)",  
                    limits = c(0, 700), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. 

Geospatial Distribution of GDP per Capita

Uses merged data frame and shades GDP per capita by country accordingly.

ggplot(data = world.sf, aes(fill = gdp.cap),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of GDP per Capita") +
  scale_fill_gradientn(name = "GDP \nper capita",  
                    limits = c(0, 60000), 
                    colours = myPalette(100)) 

Geospatial Distribution of Population Density

Uses merged data frame and shades population density by country accordingly.

ggplot(data = world.sf, aes(fill = pop.density),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of Population Density") +
  scale_fill_gradientn(name = "Population \ndensity",  
                    limits = c(0, 450), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. 

Geospatial Distribution of Median Age

Uses merged data frame and shades median age by country accordingly.

ggplot(data = world.sf, aes(fill = med.age),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of Median Age") +
  scale_fill_gradientn(name = "Median \n age",  
                    limits = c(0, 50), 
                    colours = myPalette(100)) 

Geospatial Distribution of Smoking

Uses merged data frame and shades male and female smoking percentages by country accordingly.

ggplot(data = world.sf, aes(fill = fem.smokers),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Percentage of Female Smokers by Country") +
  scale_fill_gradientn(name = "Female \nsmokers (%)",  
                    limits = c(0, 40), 
                    colours = myPalette(100)) 

ggplot(data = world.sf, aes(fill = male.smokers),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Percentage of Male Smokers by Country") +
  scale_fill_gradientn(name = "Male \nsmokers (%)",  
                    limits = c(0, 80), 
                    colours = myPalette(100)) 

Geospatial Distribution of Diabetes

Uses merged data frame and shades diabetes prevalence by country accordingly.

ggplot(data = world.sf, aes(fill = diab.prev),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Diabetes Prevalence by Country") +
  scale_fill_gradientn(name = "Diabetes \nprevalence (%)",  
                    limits = c(0, 20), 
                    colours = myPalette(100)) 

Geospatial Distribution of Cardiovascular Death

Uses merged data frame and shades indicence of cardiovascular death by country accordingly.

ggplot(data = world.sf, aes(fill = card.death),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Incidence of Cardiovascular Death by Country") +
  scale_fill_gradientn(name = "Diabetes \nprevalence (%)",  
                    limits = c(0, 800), 
                    colours = myPalette(100))